%matplotlib inline
import networkx as nx
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Association table: the 'Unnamed: 0' column holds the disease codes and the
# remaining columns are the gene/target codes.
raw_data = pd.read_csv('Full-2020_05_10-13_14_09-X_raw_disease_50_target_100_7217_by_101.csv')
raw_data.head()
disease_list = list(raw_data['Unnamed: 0'])
gene_list = list(raw_data.columns[1:])

# Code -> human-readable name lookups for diseases and genes.
df_diseases_names = pd.read_csv('disease_list.csv')
d_codes = list(df_diseases_names['efo_id'])
d_names = list(df_diseases_names['disease_full_name'])
diseases_names = dict(zip(d_codes, d_names))
# 'COVID' is not looked up in the name files, so it maps to itself.
diseases_names['COVID'] = 'COVID'

df_genes_names = pd.read_csv('target_list.csv')
g_codes = list(df_genes_names['ensembl_id'])
g_names = list(df_genes_names['hgnc_approved_symbol'])
genes_names = dict(zip(g_codes, g_names))
genes_names['COVID'] = 'COVID'
# Data for Graph: one (disease name, gene name, score) triple for every
# non-zero cell of the association table.
data = []
for row_position in range(raw_data.shape[0]):
    row = raw_data.iloc[row_position]
    source_name = diseases_names[row['Unnamed: 0']]
    for gene in gene_list:
        weight = row[gene]
        if weight != 0:
            data.append((source_name, genes_names[gene], weight))

# Graph Creation: disease and gene nodes are tagged through the 'label'
# node attribute, edges carry the score as 'weight'.
Graph = nx.Graph()
diseases = np.unique([edge[0] for edge in data])
genes = np.unique([edge[1] for edge in data])
Graph.add_nodes_from(diseases, label='Disease')
Graph.add_nodes_from(genes, label='Gene')
Graph.add_weighted_edges_from(data)
nodelist = {'Diseases': diseases, 'Genes': genes}
# Score with which each disease is associated to COVID (the 'COVID' column).
score = list(raw_data.COVID)
covid_score = {diseases_names[code]: value for code, value in zip(disease_list, score)}
# (disease name, score) pairs ordered by decreasing score. The sort key is
# spelled inline because the custom_sort helper is only defined further down
# the file and is therefore not available yet at this point.
covid_score_list = sorted(covid_score.items(), key=lambda pair: pair[1], reverse=True)
print('Number of nodes: {}'.format(len(Graph.nodes)))
print('Number of edges: {}'.format(len(Graph.edges)))
# Container for per-node information about the graph.
Graph_infos = {}
# Degree (number of edges at each node) for all nodes, used for the summary
# statistics (min, max, median, mean, quartiles) and for the histogram.
Graph_infos['Degrees'] = dict(nx.degree(Graph))
# Only the degree values, in node order.
degrees_val = list(Graph_infos['Degrees'].values())

# Summary statistics.
degree_min = np.min(degrees_val)
degree_max = np.max(degrees_val)
degree_median = np.median(degrees_val)
degree_mean = np.mean(degrees_val)
degree_u_quant = np.quantile(degrees_val, 0.75)
degree_l_quant = np.quantile(degrees_val, 0.25)
for template, value in (
    ('Min degree connection: {}', degree_min),
    ('Max degree connection: {}', degree_max),
    ('Median degree connection: {}', degree_median),
    ('Mean degree connection: {}', degree_mean),
    ('75% quantile degree connection: {}', degree_u_quant),
    ('25% quantile degree connection: {}', degree_l_quant),
):
    print(template.format(value))
    print('')

# Drop one occurrence of the largest degree before plotting.
# NOTE(review): presumably that is the COVID node -- confirm.
degrees_val.remove(degree_max)

# Degree histogram with a logarithmic y axis.
plt.figure(figsize=(12, 7))
sns.set(style="whitegrid")
sns.distplot(degrees_val, axlabel="Degree", kde=False)
# NOTE(review): nothing plotted here carries a label, so this legend is empty.
plt.legend()
plt.yscale('log')
plt.title("Edges Connection Histogram, lin - log", fontsize=20)
plt.savefig('Degrees_full.png')
plt.legend()
# Degree histogram restricted to the disease nodes.
degrees_val_disease = [Graph_infos['Degrees'][d] for d in diseases]

# Summary statistics.
degree_min = np.min(degrees_val_disease)
degree_max = np.max(degrees_val_disease)
degree_median = np.median(degrees_val_disease)
degree_mean = np.mean(degrees_val_disease)
degree_u_quant = np.quantile(degrees_val_disease, 0.75)
degree_l_quant = np.quantile(degrees_val_disease, 0.25)
for template, value in (
    ('Min degree connection: {}', degree_min),
    ('Max degree connection: {}', degree_max),
    ('Median degree connection: {}', degree_median),
    ('Mean degree connection: {}', degree_mean),
    ('75% quantile degree connection: {}', degree_u_quant),
    ('25% quantile degree connection: {}', degree_l_quant),
):
    print(template.format(value))
    print('')

# Drop one occurrence of the largest degree before plotting.
# NOTE(review): presumably meant to exclude COVID -- confirm that COVID is
# actually the disease node with the highest degree.
degrees_val_disease.remove(degree_max)

plt.figure(figsize=(16, 9))
sns.set(style="whitegrid")
sns.distplot(degrees_val_disease, axlabel="Degree", kde=False)
# Vertical markers for the statistics computed above (before the removal).
for position, colour, legend_text in (
    (degree_u_quant, 'red', '75% quantile: {}'.format(degree_u_quant)),
    (degree_l_quant, 'blue', '25% quantile: {}'.format(degree_l_quant)),
    (degree_median, 'green', 'Median: {}'.format(degree_median)),
    (degree_mean, 'yellow', 'Mean: {}'.format(round(degree_mean, 2))),
):
    plt.axvline(position, color=colour, linestyle='--', label=legend_text)
plt.legend()
plt.yscale('log')
plt.title("Edges Connection Histogram Diseases, lin - log", fontsize=20)
plt.savefig('Degrees_diseases.png')
plt.show()
# Degree histogram restricted to the gene nodes.
degrees_val_genes = [Graph_infos['Degrees'][g] for g in genes]
print(max(degrees_val_genes))

# Summary statistics.
degree_min = np.min(degrees_val_genes)
degree_max = np.max(degrees_val_genes)
degree_median = np.median(degrees_val_genes)
degree_mean = np.mean(degrees_val_genes)
degree_u_quant = np.quantile(degrees_val_genes, 0.75)
degree_l_quant = np.quantile(degrees_val_genes, 0.25)
for template, value in (
    ('Min degree connection: {}', degree_min),
    ('Max degree connection: {}', degree_max),
    ('Median degree connection: {}', degree_median),
    ('Mean degree connection: {}', degree_mean),
    ('75% quantile degree connection: {}', degree_u_quant),
    ('25% quantile degree connection: {}', degree_l_quant),
):
    print(template.format(value))
    print('')

# Drop one occurrence of the largest degree before plotting.
# NOTE(review): presumably that is the COVID node -- confirm.
degrees_val_genes.remove(degree_max)

plt.figure(figsize=(16, 9))
sns.set(style="whitegrid")
sns.distplot(degrees_val_genes, axlabel="Degree", kde=False)
# Vertical markers for the statistics computed above (before the removal).
for position, colour, legend_text in (
    (degree_u_quant, 'red', '75% quantile: {}'.format(degree_u_quant)),
    (degree_l_quant, 'blue', '25% quantile: {}'.format(degree_l_quant)),
    (degree_median, 'green', 'Median: {}'.format(degree_median)),
    (degree_mean, 'yellow', 'Mean: {}'.format(round(degree_mean, 2))),
):
    plt.axvline(position, color=colour, linestyle='--', label=legend_text)
plt.legend()
plt.yscale('log')
plt.title("Edges Connection Histogram Genes, yscale logarithmic", fontsize=20)
plt.show()
# Centrality measures for every node, keyed by measure name. The keys become
# the column headers of the CSV written below.
Graph_centrality = {}

# Degree of each node computed with the 'weight' edge attribute.
deg_cen = dict(Graph.degree(weight='weight'))
Graph_centrality['Degree Centrality'] = deg_cen

# NOTE(review): closeness is computed without edge weights, unlike the other
# measures -- confirm this is intended.
clo_cen = nx.closeness_centrality(Graph)
Graph_centrality['Closeness Centrality'] = clo_cen

# NOTE(review): shortest-path based measures usually read 'weight' as a
# distance; confirm that association scores should be used that way.
bw_cen = nx.betweenness_centrality(Graph, weight='weight')
Graph_centrality['Betweeness Centrality'] = bw_cen

pr_cen = nx.pagerank(Graph, weight='weight')
Graph_centrality['Pagerank Centrality'] = pr_cen

# One row per node, one column per centrality measure.
df_cen = pd.DataFrame.from_dict(Graph_centrality)
df_cen.to_csv('Centralities_weighted.csv')
# Reload the centralities from disk and rebuild one {node: value} dict per
# centrality measure.
centralities_w = pd.read_csv('Centralities_weighted.csv')
centralities_w_dict = centralities_w.to_dict('series')
centralities_w.head()

# The first column holds the node identifiers and every other column is a
# centrality measure. The identifier column is taken by position rather than
# by a fixed header, because its name depends on how the CSV was produced
# (pandas reads an unnamed index column back as 'Unnamed: 0').
node_id_column = centralities_w.columns[0]
centrality_w_names = list(centralities_w.columns)[1:]
disease_codes = list(centralities_w[node_id_column])

Graph_centrality_w = {}
for col in centrality_w_names:
    Graph_centrality_w[col] = dict(zip(disease_codes, centralities_w[col]))

# Identifiers that are not nodes of Graph are treated as gene codes and
# re-keyed to the gene name, so that every key matches a graph node.
for cen in Graph_centrality_w:
    values = Graph_centrality_w[cen]
    # Iterate over a snapshot of the keys because the dict is re-keyed in place.
    for node in list(values):
        if node in Graph:
            continue
        values[genes_names[node]] = values.pop(node)
# in order to sort dicitonaries
import operator
# Turn a sequence of pairs into a dict.
def list_to_dict(a):
    """Return a dict mapping item[0] to item[1] for every item of ``a``.

    When the same first element appears several times, the last one wins.
    """
    mapping = {}
    for entry in a:
        mapping[entry[0]] = entry[1]
    return mapping
def custom_sort(t):
    """Sort key: return the second element of ``t`` (e.g. the score of a
    (name, score) pair)."""
    second_item = t[1]
    return second_item
# print the top n nodes for a given centrality measure
def top_print(name, centrality, n, nodelist, label):
    """Print and return the ``n`` best nodes of ``nodelist`` for one centrality.

    Args:
        name: name of the centrality measure, used in the printed header.
        centrality: mapping node -> centrality value; must contain every node
            of ``nodelist``.
        n: number of nodes to report. When fewer than ``n`` nodes are
            available, all of them are reported.
        nodelist: nodes to rank.
        label: kind of node being ranked, used in the printed header.

    Returns:
        The reported node names, best first.
    """
    # Highest value first; ties keep their order from ``nodelist``.
    ranked = sorted(
        ((node, centrality[node]) for node in nodelist),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top = ranked[:n]
    print('-------------------------------------------')
    print("{}: WEIGHTED CENTRALITY MEASURE BY {}".format(label.upper(), name.upper()))
    print(' ')
    for position, (node, value) in enumerate(top, start=1):
        # Values above 1 are printed without decimals, the others with five.
        template = "{}. {}: {:.0F}" if value > 1 else "{}. {}: {:.5F}"
        print(template.format(position, node.upper(), value))
    return [node for node, _ in top]
# Top 10 diseases for every centrality measure.
results = {}
for name, centrality in Graph_centrality_w.items():
    results[name] = top_print(name, centrality, 10, list(diseases), 'Disease')

# Top 10 genes for every centrality measure (this replaces the disease
# results held in `results`).
results = {}
for name, centrality in Graph_centrality_w.items():
    results[name] = top_print(name, centrality, 10, list(genes), 'Genes')
def overall_ranking(G, G_centrality, n_top, G_infos, nodelist):
    """Rank nodes by the sum of their ranks over all centrality measures.

    Every node of ``nodelist`` gets a rank (1 = best) within each centrality;
    the ranks are summed and the nodes with the smallest sums come first.
    The ``n_top`` best nodes are printed together with their details.

    Args:
        G: graph the nodes belong to (not read by this function).
        G_centrality: dict centrality name -> {node: value}.
        n_top (int): number of nodes to print. When fewer nodes are available,
            all of them are printed.
        G_infos (dict): must hold, under 'Degrees', a node -> degree mapping.
        nodelist: nodes to rank.

    Returns:
        (centrality_ranked, total_ranking) where ``centrality_ranked`` maps
        each centrality name to {node: rank}, ordered best first, and
        ``total_ranking`` is a list of (node, summed rank) pairs sorted by
        increasing sum.
    """
    centralities = list(G_centrality.keys())
    # {centrality: {best node: 1, second best node: 2, ...}}
    centrality_ranked = {}
    # For each node, the rank it obtained in each centrality, in the order
    # of ``centralities``.
    overall_rank = {node: [] for node in nodelist}
    for cen in centralities:
        scores = {node: G_centrality[cen][node] for node in nodelist}
        ranking = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        centrality_ranked[cen] = {
            node: position for position, (node, _) in enumerate(ranking, start=1)
        }
        for node in nodelist:
            overall_rank[node].append(centrality_ranked[cen][node])
    # Sum the ranks each node earned; a smaller total is better.
    total = {node: np.sum(overall_rank[node]) for node in nodelist}
    total_ranking = sorted(total.items(), key=operator.itemgetter(1))
    # Slicing keeps this safe when n_top exceeds the number of nodes.
    top_nodes = [node for node, _ in total_ranking[:n_top]]

    print('OVERALL RANKING')
    for position, node in enumerate(top_nodes, start=1):
        print('')
        print('------------------')
        print('Node: {}'.format(node.upper()))
        print('')
        print('No: {} in centrality overall ranking'.format(position))
        print('')
        print('Specificity:')
        print('')
        print('Number of connections: {}'.format(G_infos['Degrees'][node]))
        print('')
        for cen, rank in zip(centralities, overall_rank[node]):
            print('No: {} in {} ranking'.format(rank, str(cen)))
    return centrality_ranked, total_ranking
# Overall rankings, computed separately for diseases and for genes.
centrality_ranked_d, total_ranking_d = overall_ranking(Graph, Graph_centrality_w, 10, Graph_infos, diseases)
centrality_ranked_g, total_ranking_g = overall_ranking(Graph, Graph_centrality_w, 10, Graph_infos, genes)

plt.figure(figsize=(15, 15))
# Load the precomputed node positions.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
layout = np.load('layout_nx.npy', allow_pickle=True).item()
# Draw the base network in grey.
nx.draw_networkx(Graph, pos=layout, node_size=30, with_labels=False, node_color='grey', edge_color='0.7')
# Colour the nodes by type: diseases in red, genes in green.
nodelist = list(Graph.nodes())
nodelist_d = list(diseases)
nodelist_g = list(genes)
nx.draw_networkx_nodes(Graph, pos=layout, nodelist=nodelist_d, node_color='red', node_size=30)
nx.draw_networkx_nodes(Graph, pos=layout, nodelist=nodelist_g, node_color='green', node_size=30)
# Hide the axes before saving so the PNG matches what is displayed.
plt.axis("off")
plt.savefig('full_network.png')
plt.show()
def draw_graph_top(G, centrality_ranked, n, layout, name, node_list, node_type):
    """Draw ``G`` and highlight the first ``n`` nodes of ``centrality_ranked``.

    The figure is saved as '<name>_<node_type>_centrality.png' and shown.

    Args:
        G: graph to draw.
        centrality_ranked: dict node -> rank whose first ``n`` keys are
            highlighted (assumes the dict is ordered best first).
        n: maximum number of nodes to highlight.
        layout: node -> position mapping used for drawing.
        name: centrality name, used in the title and the file name.
        node_list: unused, kept for backward compatibility.
        node_type: kind of node, used in the title and the file name.
    """
    plt.figure(figsize=(15, 15))
    # Draw the base network in grey.
    nx.draw_networkx(G, pos=layout, node_size=30, with_labels=False, node_color='grey', edge_color='0.7')
    # Highlight the top nodes with bigger, coloured markers.
    top_nodes = list(centrality_ranked.keys())[:n]
    labels = {node: node for node in top_nodes}
    palette = ['#1ECD3F', '#0C437E', '#30F77B', '#E0016D', '#E0C6F4', '#A402CF', '#6C5020', '#014F05', '#3636DE', '#0E82F6']
    big_size = 3000
    # Colours and sizes follow the number of nodes actually drawn, so the
    # lists always match ``top_nodes`` (the palette repeats past 10 nodes).
    highlighted = range(len(top_nodes))
    colors = [palette[i % len(palette)] for i in highlighted]
    sizes = [big_size / (i + 1) for i in highlighted]
    # with_labels is not a draw_networkx_nodes parameter; the labels are
    # drawn by the draw_networkx_labels call below.
    nx.draw_networkx_nodes(G, pos=layout, nodelist=top_nodes, node_color=colors, node_size=sizes)
    nx.draw_networkx_labels(G, layout, labels, font_size=16, font_color='k')
    plt.title(name + ': ' + node_type, fontsize=20)
    # Hide the axes before saving so the PNG matches what is displayed.
    plt.axis("off")
    plt.savefig(name + '_' + node_type + '_centrality.png')
    plt.show()
# Load the precomputed node positions.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
layout = np.load('layout_nx.npy', allow_pickle=True).item()
# One figure per centrality measure, first for genes and then for diseases.
for cen in centrality_ranked_g:
    draw_graph_top(Graph, centrality_ranked_g[cen], 10, layout, str(cen), genes, 'GENES')
for cen in centrality_ranked_d:
    draw_graph_top(Graph, centrality_ranked_d[cen], 10, layout, str(cen), diseases, 'DISEASE')
total_ranking_d
plt.figure(figsize=(15, 15))
# Draw the base network in grey.
nx.draw_networkx(Graph, pos=layout, node_size=30, with_labels=False, node_color='grey', edge_color='0.7')
# Highlight the 10 best genes of the overall ranking with bigger markers.
top_nodes = [node[0] for node in total_ranking_g][:10]
labels = {i: i for i in top_nodes}
colors = ['#1ECD3F', '#0C437E', '#30F77B', '#E0016D', '#E0C6F4', '#A402CF', '#6C5020', '#014F05', '#3636DE', '#0E82F6']
big_size = 3000
# Sizes and colours are derived from the nodes actually drawn; no module-level
# `n` is defined before this point.
sizes = [big_size / (i + 1) for i in range(len(top_nodes))]
# with_labels is not a draw_networkx_nodes parameter, so it is not passed.
nx.draw_networkx_nodes(Graph, pos=layout, nodelist=top_nodes, node_color=colors[:len(top_nodes)], node_size=sizes)
plt.title('Overall ranking_g', fontsize=20)
# Hide the axes before saving so the PNG matches what is displayed.
plt.axis("off")
plt.savefig('overall_nx.png')
plt.show()
# Knowledge-graph sample: one (subject, predicate, object) triple per row.
raw_data_predicate = pd.read_csv('COVID_KG_sample.csv')
raw_data_predicate.head()
predicate = raw_data_predicate.predicate
predicate.value_counts()
predicate_type = np.unique(list(predicate))

# {predicate: [(subject, object), ...]} for every predicate in the file.
data_full = {}
for pred in predicate_type:
    # The mask has to come from raw_data_predicate itself; raw_data is a
    # different table and has no 'predicate' column.
    sub_df = raw_data_predicate[raw_data_predicate.predicate == pred]
    node_pairs = list(zip(sub_df['subject'], sub_df['object']))
    data_full[pred] = node_pairs
# Edge list between diseases, taken from the 'hasGeneticClue' triples.
dis_to_dis = [(diseases_names[src], diseases_names[dst]) for src, dst in data_full['hasGeneticClue']]
Graph2 = Graph.copy()
Graph2.add_edges_from(dis_to_dis)
# add_edges_from creates missing endpoints without attributes. Both endpoints
# come from diseases_names, so any node still lacking a 'label' is tagged as
# a disease; nodes that already have a label keep it.
for endpoints in dis_to_dis:
    for node in endpoints:
        Graph2.nodes[node].setdefault('label', 'Disease')
# Load the node positions stored for this graph.
# NOTE(review): 'layout2.npy' was presumably produced by
# nx.spring_layout(Graph2) -- confirm.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
layout_dd = np.load('layout2.npy', allow_pickle=True).item()
print('Number of nodes: {}'.format(len(Graph2.nodes)))
print('Number of edges: {}'.format(len(Graph2.edges)))
# From a ranking (e.g. total_ranking), build the sub-graph around each of the
# n best nodes.
def top_subgraphs(G, total_ranking, n):
    """Build the star sub-graph around each of the first ``n`` ranked nodes.

    Args:
        G: graph whose nodes carry a 'label' attribute ('Gene' or 'Disease').
        total_ranking: sequence of (node, score) pairs, best first.
        n: number of leading nodes to process.

    Returns:
        dict node -> dict with the keys 'edgelist' (edges from the node to
        each neighbour), 'node_list' (the neighbours followed by the node
        itself), 'gene_nodes', 'disease_nodes', 'Total Nodes', 'Total Genes',
        'Total Diseases' (counts of neighbours), 'layout' (spring layout of
        the sub-graph) and 'sub_graph'.
    """
    sub_graphs = {}
    top_diseases = [entry[0] for entry in total_ranking][:n]
    for d in top_diseases:
        # Every neighbour counts, whichever way the undirected edge is stored.
        # A ranked node that is absent from G simply gets an empty sub-graph.
        neighbours = list(G.neighbors(d)) if d in G else []
        edges = [(d, other) for other in neighbours]
        # .get() so that a node without a 'label' is left out of both groups
        # instead of raising.
        gene_nodes = [node for node in neighbours if G.nodes[node].get('label') == 'Gene']
        disease_nodes = [node for node in neighbours if G.nodes[node].get('label') == 'Disease']

        new_G = nx.Graph()
        new_G.add_edges_from(edges)

        sub_graphs[d] = {
            'edgelist': edges,
            # The central node is listed last; the counts below exclude it.
            'node_list': neighbours + [d],
            'gene_nodes': gene_nodes,
            'disease_nodes': disease_nodes,
            'Total Nodes': len(neighbours),
            'Total Genes': len(gene_nodes),
            'Total Diseases': len(disease_nodes),
            'layout': nx.spring_layout(new_G),
            'sub_graph': new_G,
        }
    return sub_graphs
# Sub-graphs around the 5 best diseases of the overall ranking.
sub_graphs_top = top_subgraphs(Graph2, total_ranking_d, 5)
# Keep only the size figures of each sub-graph and export them as a table.
summary_keys = ('Total Nodes', 'Total Genes', 'Total Diseases')
results = {
    d: {key: sub_graphs_top[d][key] for key in summary_keys}
    for d in sub_graphs_top
}
pd.DataFrame.from_dict(results).to_csv('top_nodes_comparison.csv')
# Sub-graphs around the 5 diseases with the highest COVID score.
sub_graph_covid = top_subgraphs(Graph2, covid_score_list, 5)
# Draw each central disease with its own network
def draw_disease_top(sub_graphs_top, name, classification):
    """Draw the sub-graph of one central disease and save it as a PNG.

    Args:
        sub_graphs_top: dict describing ONE sub-graph; the keys 'sub_graph',
            'layout', 'node_list', 'disease_nodes' and 'gene_nodes' are read.
        name: central node of the sub-graph, used for the title and file name.
        classification: prefix of the output file name.
    """
    plt.figure(figsize=(16, 10))
    G = sub_graphs_top['sub_graph']
    layout = sub_graphs_top['layout']
    # Draw the base network in grey.
    nx.draw_networkx(G, pos=layout, node_size=30, with_labels=False, node_color='grey', edge_color='0.7')
    covid = 'COVID'
    # Labels of the surrounding nodes; the central node gets its own label.
    labels_minor = {node: node for node in sub_graphs_top['node_list']}
    labels_minor.pop(name, None)
    diseases_nodes = sub_graphs_top['disease_nodes']
    genes_nodes = sub_graphs_top['gene_nodes']
    # with_labels is not a draw_networkx_nodes parameter; the labels are drawn
    # by the draw_networkx_labels calls below.
    nx.draw_networkx_nodes(G, pos=layout, nodelist=genes_nodes, node_color='green', node_size=700)
    nx.draw_networkx_nodes(G, pos=layout, nodelist=diseases_nodes, node_color='red', node_size=1000)
    nx.draw_networkx_labels(G, layout, labels_minor, font_size=12, font_color='k')
    # The central node and COVID are only drawn when they have a position,
    # i.e. when they are part of this sub-graph.
    if name in layout:
        nx.draw_networkx_nodes(G, pos=layout, nodelist=[name], node_color='yellow', node_size=5000)
        nx.draw_networkx_labels(G, layout, {name: name.upper()}, font_size=15, font_color='k')
    if covid in layout:
        nx.draw_networkx_nodes(G, pos=layout, nodelist=[covid], node_color='blue', node_size=2200)
        nx.draw_networkx_labels(G, layout, {covid: covid}, font_size=15, font_color='white')
    plt.title(name.upper(), fontsize=20)
    # Hide the axes before saving so the PNG matches what is displayed.
    plt.axis("off")
    plt.savefig(classification + 'final' + str(name) + '.png')
    plt.show()
# One figure per central disease of the overall ranking ...
for disease_name, disease_sub_graph in sub_graphs_top.items():
    draw_disease_top(disease_sub_graph, disease_name, 'OVERALL')
# ... and one per disease with a top COVID score.
for disease_name, disease_sub_graph in sub_graph_covid.items():
    draw_disease_top(disease_sub_graph, disease_name, 'COVID')
# Group diseases by the generic category they belong to: every 'isASpecific'
# link is (specific disease, generic category).
generic = {}
for link in data_full['isASpecific']:
    generic.setdefault(diseases_names[link[1]], []).append(diseases_names[link[0]])
generic_names = list(generic.keys())

# Size of each category (number of specific diseases it groups).
generic_sizes = [(gen, len(members)) for gen, members in generic.items()]
generic_sizes_dict = dict(generic_sizes)

# COVID score of each category: sum of the scores of its member diseases.
# Members without a COVID score are skipped.
generic_covid_score_dict = {}
for gen, belong in generic.items():
    scores = [covid_score[member] for member in belong if member in covid_score]
    generic_covid_score_dict[gen] = np.sum(scores)
generic_covid_score = list(generic_covid_score_dict.items())

# Biggest / highest-scoring categories first.
generic_sizes.sort(key=custom_sort, reverse=True)
generic_covid_score.sort(key=custom_sort, reverse=True)
# Draw the clusters (categories of disease) around COVID for a given score.
def draw_covid_generic(generic_score, n, name, edge_factor, node_factor):
    """Draw COVID linked to the first ``n`` categories of ``generic_score``.

    Edge widths and node sizes are the category scores multiplied by
    ``edge_factor`` and ``node_factor``. The figure is saved as
    'final<name>.png' and shown.

    Args:
        generic_score: sequence of (category, score) pairs, best first.
        n: number of leading categories to draw.
        name: title of the figure, also used in the file name.
        edge_factor: multiplier turning a score into an edge width.
        node_factor: multiplier turning a score into a node size.
    """
    covid = 'COVID'
    nodes_and_score = generic_score[:n]
    nodes = [category for category, _ in nodes_and_score]
    w_edges = [(covid, category, value) for category, value in nodes_and_score]
    simple_edges = [(covid, category) for category in nodes]

    G = nx.Graph()
    G.add_weighted_edges_from(w_edges)
    layout = nx.spring_layout(G)

    plt.figure(figsize=(16, 10))
    # Draw the base network in grey.
    nx.draw_networkx(G, pos=layout, node_size=30, with_labels=False, node_color='grey', edge_color='0.7')
    # Scale edges and nodes with the score of each category.
    labels_minor = {category: category for category in nodes}
    label_covid = {covid: covid}
    edge_size = [value * edge_factor for _, value in nodes_and_score]
    node_size = [value * node_factor for _, value in nodes_and_score]
    nx.draw_networkx_edges(G, pos=layout, edgelist=simple_edges, width=edge_size, edge_color='grey')
    # with_labels is not a draw_networkx_nodes parameter; the labels are drawn
    # by the draw_networkx_labels calls below.
    nx.draw_networkx_nodes(G, pos=layout, nodelist=nodes, node_size=node_size, node_color='red')
    nx.draw_networkx_nodes(G, pos=layout, nodelist=[covid], node_color='blue', node_size=3000)
    nx.draw_networkx_labels(G, layout, labels_minor, font_size=12, font_color='k')
    nx.draw_networkx_labels(G, layout, label_covid, font_size=15, font_color='white')
    plt.title(name.upper(), fontsize=20)
    # Hide the axes before saving so the PNG matches what is displayed.
    plt.axis("off")
    plt.savefig('final' + str(name) + '.png')
    plt.show()
# Categories ranked by summed COVID score, then by size.
draw_covid_generic(generic_covid_score, 10, 'COVID SCORE', edge_factor=20, node_factor=1e4)
draw_covid_generic(generic_sizes, 10, 'SIZE', edge_factor=0.05, node_factor=10)

# COVID score per member disease; empty categories are left out.
normed_generic = []
for gen, category_size in generic_sizes_dict.items():
    if category_size == 0:
        continue
    normed_generic.append((gen, float(generic_covid_score_dict[gen] / category_size)))
normed_generic.sort(key=custom_sort, reverse=True)
draw_covid_generic(normed_generic, 30, 'NORMED', edge_factor=3e2, node_factor=5e4)